clear

************************************************************************************
* DESCRIPTION: This script processes CSV data into Stata format, 
* prepares variables, and aggregates all the information at the firm-level.
************************************************************************************

/* Start logging */
capture log close
log using "$logs\csv_to_date_ifp.log", replace

*************************************************************
* Load and clean data 
*************************************************************

/* Import dataset from an external source */
import delimited "$input\SCR_CIB_3_months_orbis_scrapped.csv", clear

/* Convert date variable to Stata format */
gen time = date(date, "YMD")
format time %td
gen mdate = mofd(time)  // Convert to month format

/* Rename loan-related variables for clarity */
rename loanst stloan
rename loanlt ltloan
rename loan outsideloan
rename datestart outsideloandate

/* Extract rating category by removing the first letter */
gen rating10 = substr(rating, 2, 4)
tab rating10 p2p

/* Standardize rating categories */
replace rating10 = "3A" if rating10 == "3++"
replace rating10 = "3B" if rating10 == "3+"
replace rating10 = "3C" if rating10 == "3"
replace rating10 = "4A" if rating10 == "4+"
replace rating10 = "4B" if rating10 == "4"
replace rating10 = "5A" if rating10 == "5+"
replace rating10 = "5B" if rating10 == "5"

drop rating
encode rating10, gen(rating)

/* Save the cleaned dataset */
save "$output\SCR_CIB_3_months_orbis_scrapped.dta", replace

/* Filter out new outside loans */
keep if newoutside == 0

/* Aggregate loan data at the firm-level */
collapse (sum) drawn available stloan ltloan co bm bi oc, ///
    by(date siren outsideloan loantype rate maturity p2p year rating industry ///
    activity region size outsideloandate contrancollateral forinvestment post ///
    deltamonths r1 r1filled r2 r3 r3filled r4 r5 r5filled r6 r7 r7filled ///
    r8 r8filled r9 r9filled r10 r11 r12 r12filled r12bis r13 r13filled r14 ///
    r15 r16 r17 r_collateral r_collateral_filled r_collateral2 r_inno1 r_inno2 ///
    r_inno3 rdcostsfilled employees assets assets_filled age birthday investment ///
    salaires va sales fiben mdate roe roa profitmargin npm gpm ageenmois agefilled ///
    account_receivables_filled earlyrepayment earlyrepaymentless6 ///
    earlyrepaymentless12 contrancollateral pclopen pclliquidate less6 immo_corpo ///
    ageatevent dettefiscta dettefouta outlyingplatform inventory cash creclie ///
    dettefou dettefisc otherwc)

/* Compute logarithmic transformations of financial variables */
gen logassets = log(assets)
gen logassets_filled = log(assets_filled)

/* Compute loan timing variables */
gen outloandate = date(outsideloandate, "YMD")
gen outloanmonth = month(outloandate)
gen outloanquarter = quarter(outloandate)
gen outloanyear = year(outloandate)

gen datestata = date(date, "YMD")
gen quarterdat = quarter(datestata)
gen datemonth = month(datestata)

gen outloantime = outloanyear * 100 + outloanmonth  // Used for matching
gen outloanqtime = outloanyear * 10 + outloanquarter

gen delta = (year - outloanyear) * 12 + (datemonth - outloanmonth)

/* Sort and reorder data */
sort siren year quarterdat delta date 
order siren year delta date p2p industry region size outsideloan outsideloandate

/* Save aggregated dataset */
save "$output\ifp_scrapped_aggregated.dta", replace

*************************************************************
* Industry classification adjustments
*************************************************************

/* Group industries into broader sectors */
replace industry = "public" if inlist(industry, "P", "Q", "R", "S")  // public and quasi-public sector
replace industry = "DE" if inlist(industry, "D", "E")  // Merge D & E
replace industry = "FL" if inlist(industry, "F", "L")  // Merge F & L

/* Exclude unwanted industries */
drop if industry == "A"   // Agriculture
drop if industry == "K"   // Finance and insurance
drop if industry == "O"   // Public administration
drop if industry == "B"   // Industry B
drop if industry == "DE"  // Not enough firms in DE category
drop if industry == "0000Z"  // Industry with little information

/* Save dataset with modified industries */
save "$output\ifp_scrapped_aggregated_industries_modified.dta", replace

/* Close log file */
log close
